You are on the data science team of a coffee company that is looking to expand its business into Ukraine. The company wants to understand the existing coffee shop market there.
You have a dataset of Google business listings containing information about coffee shops in Ukraine. The marketing manager wants to identify the key coffee shop segments, which they will use to construct their marketing plan. In the company's current location, they split the market into 5 segments. The marketing manager wants to know how many segments exist in this new market and what their key features are.
You will be presenting your findings to the Marketing Manager, who has no data science background.
The data you will use for this analysis can be accessed here: "data/coffee_shops.csv"
#pip install missingno
# Use this cell to begin, and add as many cells as you need to complete your analysis!
# Importing necessary packages
import pandas as pd
import seaborn as sns
import missingno as mso
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
import numpy as np
from scipy import stats
from sklearn.pipeline import make_pipeline
from sklearn.manifold import TSNE
# Accessing the data
df = pd.read_csv('data/coffee_shops.csv')
# Printing first 10 lines of the data
df.head(10)
| | Region | Place name | Place type | Rating | Reviews | Price | Delivery option | Dine in option | Takeout option |
|---|---|---|---|---|---|---|---|---|---|
0 | Mariupol | Dim Kavu | Coffee store | 4.6 | 206.0 | NaN | NaN | NaN | NaN |
1 | Mariupol | Коферум | Cafe | 5.0 | 24.0 | $$ | False | NaN | True |
2 | Mariupol | Кофейня Світ Чаю | Coffee shop | 5.0 | 11.0 | NaN | NaN | NaN | True |
3 | Mariupol | Кофейня Starcoff | Coffee shop | 4.4 | 331.0 | $$ | False | True | True |
4 | Mariupol | Кофейня "Friend Zone" | Coffee shop | 5.0 | 12.0 | NaN | NaN | True | True |
5 | Mariupol | Racers Coffee Shop | Espresso bar | 4.6 | 367.0 | $$ | False | True | True |
6 | Mariupol | Займемся Кофе | Coffee shop | 4.6 | 200.0 | $$ | False | True | True |
7 | Mariupol | Кофейня Rit Rit | Coffee shop | 4.6 | 292.0 | $$ | False | NaN | True |
8 | Mariupol | Кав'ярня My coffee | Coffee shop | 4.8 | 31.0 | NaN | NaN | True | True |
9 | Mariupol | LENЬ. Coffee & desserts. | Coffee shop | 4.8 | 125.0 | $$ | True | True | True |
From the first 10 rows we can already see missing values in the dataset. Exploring the data further:
# Additional information about the data and counting missing Values in the data
print('------------------------------------------General Information About the Data------------------------------------')
print(df.info())
print('------------------------------------------------Count of Missing Values-----------------------------------------')
print(df.isna().sum())
------------------------------------------General Information About the Data------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   Region           200 non-null    object
 1   Place name       200 non-null    object
 2   Place type       200 non-null    object
 3   Rating           198 non-null    float64
 4   Reviews          198 non-null    float64
 5   Price            122 non-null    object
 6   Delivery option  129 non-null    object
 7   Dine in option   140 non-null    object
 8   Takeout option   144 non-null    object
dtypes: float64(2), object(7)
memory usage: 14.2+ KB
None
------------------------------------------------Count of Missing Values-----------------------------------------
Region              0
Place name          0
Place type          0
Rating              2
Reviews             2
Price              78
Delivery option    71
Dine in option     60
Takeout option     56
dtype: int64
Most of the missing data is in Price, Delivery option, Dine in option, and Takeout option. Visualizing the data may reveal a pattern in the missingness; for that I will use the missingno package.
Plotting a matrix of the missing data:
# Setting plot defaults before creating the figure so they take effect
plt.rcParams.update({'font.size': 12, 'figure.figsize': (10, 5)})
mso.matrix(df)
plt.title('Distribution of Missing Values')
plt.show()
Sorting the columns with missing data:
# Sorting the columns of missing data
mso.matrix(df.sort_values(['Price','Delivery option','Dine in option','Takeout option']))
plt.title('Distribution of Sorted Missing Values')
plt.show()
To analyse further, a correlation heatmap of the missing values will help.
# Using a heatmap to see relations among the missing data
mso.heatmap(df,cmap='Blues')
plt.title('Correlation between Missing Values')
plt.show()
There is some correlation between Price and the different modes in which one can have coffee in Ukraine. However, the correlations are not strong: conventionally, an absolute value of 0.7 or greater is considered a strong correlation. We can also ignore the Rating-Reviews correlation of 1, which only means that the two records missing Rating are exactly the two missing Reviews.
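As a quick numeric cross-check of the heatmap, we can correlate the missingness indicators directly. A minimal sketch, keeping only the columns that actually contain missing values (zero-variance columns would produce undefined correlations):
# Correlating the missingness flags directly
missing_flags = df.isna()
missing_flags = missing_flags.loc[:, missing_flags.any()]  # drop fully observed columns
print(missing_flags.corr().round(2))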
Running descriptive statistics
# Running descriptive statistics for the two numerical columns, Rating and Reviews
df.describe()
| | Rating | Reviews |
|---|---|---|
count | 198.000000 | 198.000000 |
mean | 4.663636 | 622.494949 |
std | 0.221648 | 1400.898666 |
min | 3.900000 | 3.000000 |
25% | 4.600000 | 47.500000 |
50% | 4.700000 | 271.500000 |
75% | 4.800000 | 786.250000 |
max | 5.000000 | 17937.000000 |
df[df['Reviews']>271].isna().sum()
Region              0
Place name          0
Place type          0
Rating              0
Reviews             0
Price               8
Delivery option    18
Dine in option      7
Takeout option     14
dtype: int64
df[df['Reviews']<=271].isna().sum()
Region              0
Place name          0
Place type          0
Rating              0
Reviews             0
Price              68
Delivery option    52
Dine in option     51
Takeout option     40
dtype: int64
Although Rating is numerical, it stays within the bounds of a 5-point scale, whereas Reviews varies widely. Reviews also points to an important pattern in the data: shops with fewer reviews (at or below the median of about 271) have far more missing data in Price, Delivery option, Dine in option, and Takeout option. If we take review count as a proxy for a store's footprint, smaller stores may lack the infrastructure to capture this information, or may simply not provide these additional services. Returning to Reviews, a box plot will show how the data is distributed.
# Plotting a box plot for Reviews (horizontal, so the x-axis carries the counts)
sns.boxplot(x='Reviews', data=df, palette='husl')
plt.title('Distribution of Reviews')
plt.xlabel('Review Count')
plt.show()
Clearly there is an outlier. Printing the top 10 and bottom 10 coffee places by review count:
display(df.nlargest(10,'Reviews'))
| | Region | Place name | Place type | Rating | Reviews | Price | Delivery option | Dine in option | Takeout option |
|---|---|---|---|---|---|---|---|---|---|
100 | Lviv | Lviv Coffee Manufacture | Coffee shop | 4.7 | 17937.0 | $$ | True | True | True |
101 | Lviv | Svit Kavy | Coffee shop | 4.6 | 2931.0 | $$ | NaN | True | True |
165 | Kharkiv | Kofeyin | Coffee shop | 4.5 | 2914.0 | $$ | False | True | True |
110 | Lviv | Вірменка | Coffee shop | 4.8 | 2873.0 | $$ | False | True | True |
81 | Odessa | Traveler`s Coffee | Coffee shop | 4.6 | 2700.0 | $$ | True | True | True |
180 | Dnipro | DoubleDecker Cake and Coffee | Pastry shop | 4.7 | 2381.0 | $$$ | True | True | True |
148 | Kiev | ONE LOVE espresso bar | Coffee shop | 4.6 | 2319.0 | $$ | False | True | True |
117 | Lviv | Lviv Croissants | Cafe | 4.5 | 2236.0 | $$ | False | True | True |
89 | Odessa | Zheto | Coffee shop | 4.8 | 2141.0 | $$ | NaN | True | True |
102 | Lviv | Na bambetli | Coffee shop | 4.7 | 2089.0 | $$ | False | True | True |
display(df.nsmallest(10,'Reviews'))
| | Region | Place name | Place type | Rating | Reviews | Price | Delivery option | Dine in option | Takeout option |
|---|---|---|---|---|---|---|---|---|---|
170 | Kharkiv | Dom Kofe, Mah. | Coffee shop | 4.3 | 3.0 | NaN | NaN | NaN | NaN |
190 | Dnipro | "Франс.уа" кофейня-пекарня | Cafe | 3.9 | 9.0 | NaN | False | NaN | True |
19 | Mariupol | Coffee Drive | Coffee shop | 4.8 | 10.0 | NaN | False | True | True |
82 | Odessa | Кофе База Одесса \| Coffee Baza | Store | 5.0 | 10.0 | NaN | NaN | NaN | NaN |
99 | Odessa | Coffee break ZAVIDA | Coffee shop | 4.9 | 10.0 | NaN | NaN | NaN | NaN |
116 | Lviv | Liberty | Cafe | 5.0 | 10.0 | NaN | NaN | NaN | True |
160 | Kharkiv | КофеТочка Харьков - интернет-магазин кофе и ко... | Coffee store | 3.9 | 10.0 | NaN | NaN | NaN | NaN |
2 | Mariupol | Кофейня Світ Чаю | Coffee shop | 5.0 | 11.0 | NaN | NaN | NaN | True |
60 | Zaporozhye | Art Coffee | Espresso bar | 4.6 | 11.0 | NaN | NaN | NaN | NaN |
4 | Mariupol | Кофейня "Friend Zone" | Coffee shop | 5.0 | 12.0 | NaN | NaN | True | True |
Although Lviv Coffee Manufacture's review count looks a little odd, some desk research confirms it really does have that many reviews. Also, half of the ten most-reviewed coffee places are in Lviv. So let's see how shops are distributed over the different regions:
display(df.groupby('Region')['Place name'].count())
# Aggregating review statistics per region (Place name is a string column and cannot be averaged)
display(df.groupby('Region')['Reviews'].agg(['count','mean','median']))
Region
Dnipro         20
Kharkiv        20
Kherson        20
Khrivoy Rog    20
Kiev           20
Lviv           20
Mariupol       20
Odessa         20
Poltava        20
Zaporozhye     20
Name: Place name, dtype: int64
| Region | Reviews (count) | Reviews (mean) | Reviews (median) |
|---|---|---|---|
Dnipro | 19 | 589.947368 | 189.0 |
Kharkiv | 20 | 544.450000 | 278.5 |
Kherson | 19 | 370.157895 | 275.0 |
Khrivoy Rog | 20 | 461.900000 | 334.5 |
Kiev | 20 | 764.600000 | 443.5 |
Lviv | 20 | 1938.650000 | 933.5 |
Mariupol | 20 | 193.800000 | 151.0 |
Odessa | 20 | 422.000000 | 95.5 |
Poltava | 20 | 535.250000 | 338.0 |
Zaporozhye | 20 | 389.950000 | 195.0 |
The coffee places are equally distributed across the regions (20 each). If we assume the number of reviews is directly proportional to a shop's footprint, then Lviv, although only the 7th-largest city/region in Ukraine by population, ranks number 1 in reviews per shop (both mean and median).
The top three cities by reviews are not the most populated cities (source: https://en.wikipedia.org/wiki/List_of_cities_in_Ukraine): Lviv (7th by population), Kiev (1st by population), Poltava (18th by population). Meanwhile Odessa, the third-largest city by population, sits near the bottom of the list by reviews.
However, I don't have enough data to explain this pattern, which points to a future enhancement: enriching the dataset with additional information such as the population of each region, per capita income, and demographics such as age and gender.
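As a sketch of what that enrichment could look like (the population figures below are rough placeholders for illustration, not verified values):
# Hypothetical enrichment: merge approximate regional populations (placeholder values!)
population = pd.DataFrame({
    'Region': ['Kiev', 'Kharkiv', 'Odessa', 'Dnipro', 'Lviv'],
    'Population': [2_950_000, 1_430_000, 1_010_000, 980_000, 720_000],
})
enriched = df.merge(population, on='Region', how='left')
# Total reviews per resident as a rough proxy for coffee-market intensity
reviews_per_capita = (enriched.groupby('Region')['Reviews'].sum()
                      / enriched.groupby('Region')['Population'].first())
print(reviews_per_capita.dropna().sort_values(ascending=False))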
Now, let's see the distribution of the different place types (Place type):
display(df['Place type'].value_counts())
Coffee shop                 97
Cafe                        58
Espresso bar                20
Coffee store                11
Store                        3
Coffee roasters              2
Pastry shop                  2
Chocolate shop               1
European restaurant          1
Internet shop                1
Appliance repair service     1
Coffee stand                 1
E-commerce service           1
Restaurant                   1
Name: Place type, dtype: int64
The majority of places that serve coffee have coffee as their primary business; only for a handful is it an auxiliary service. Thus Place type does not help much with classifying the different places: Coffee shop, Cafe, Espresso bar, Coffee roasters, Coffee stand, and Coffee store are not very distinct from one another.
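A small sketch of how the long tail of place types could be collapsed into two buckets to confirm this (the grouping below is my own assumption about which types are coffee-focused):
# Collapsing Place type into coffee-focused vs. other (assumed grouping)
coffee_types = ['Coffee shop', 'Cafe', 'Espresso bar', 'Coffee store',
                'Coffee roasters', 'Coffee stand']
type_group = np.where(df['Place type'].isin(coffee_types), 'Coffee-focused', 'Other')
print(pd.Series(type_group).value_counts())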
Although Price has a lot of missing data, let's look at its distribution:
df['Price'].value_counts(normalize=True)
$$     0.950820
$$$    0.032787
$      0.016393
Name: Price, dtype: float64
95% of the shops with available price data are in the medium price range, so providing a cheaper alternative with good quality coffee may offer a competitive advantage.
As far as ratings go, the minimum is 3.9 and the IQR runs from 4.6 to 4.8. It is a competitive landscape; most customers seem happy with what they are getting in the stores.
# Visualizing the Rating
sns.displot(x='Rating', data=df)
plt.title('Distribution of Ratings')
plt.show()
Another interesting data point is how many stores offer all three service options, i.e. dine in, takeout, and delivery:
# Summing the three boolean option columns; rows with any missing flag become NaN
df['exp'] = df['Dine in option'] + df['Takeout option'] + df['Delivery option']
df['exp'].value_counts()
2    68
3    23
Name: exp, dtype: int64
Only 23 stores out of 200 offer all three options (with the caveat that many of the flags are missing).
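Given the missing flags, the true count can only be bounded. A minimal sketch: the lower bound counts shops with all three flags explicitly True, the upper bound counts shops where none of the flags is explicitly False:
# Bounding the "all three options" count under missing data
opts = ['Delivery option', 'Dine in option', 'Takeout option']
lower = (df[opts] == True).all(axis=1).sum()        # all three confirmed True
upper = (~(df[opts] == False).any(axis=1)).sum()    # none confirmed False
print(f'Shops offering all three options: between {lower} and {upper} of {len(df)}')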
Visualizing Rating vs. Reviews, first overall and then by region:
# Plotting it by using Seaborn
sns.set_style('whitegrid')
sns.catplot(x='Rating',y='Reviews',data=df,kind='bar')
#plt.yscale('log')
plt.title('Overall Review vs. Rating')
plt.show()
sns.set_style('whitegrid')
sns.catplot(x='Rating',y='Reviews',data=df,col='Region',col_wrap=2,kind='bar')
#plt.yscale('log')
plt.title('Overall Review vs. Rating by Region')
plt.show()
The coffee experience in Ukraine is quite favorable: most coffee shops have a very high rating, with the lowest at 3.9 and the middle 50% of ratings between 4.6 and 4.8. If Rating is treated as a Likert-style scale, nearly every shop falls in the favorable range (4 and above). As far as price goes, most of the coffee shops (95% of those with price data) sit uniformly in the $$ range, which may give us a competitive advantage if we can provide low-priced, high-quality coffee. Lviv looks like the coffee capital of Ukraine: if reviews translate to the number of people visiting a store, half of the most-reviewed stores are in Lviv, which is interesting given that Lviv is only the 7th most populated city in Ukraine, while a much larger region like Odessa has a smaller footprint.
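To back the favorability claim with a number (treating 4.0 and above as favorable is my own top-two-box style cut, not a fixed standard):
# Share of rated shops at 4.0 or above
favorable_share = (df['Rating'].dropna() >= 4.0).mean()
print(f'Share of shops rated 4.0 or above: {favorable_share:.1%}')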
I will use the KMeans clustering algorithm to find the market segments.
For the model features, I will drop Place name and Place type; as found in the exploratory data analysis, they add little value for segmentation. The segments are then built from the remaining features, with Region kept aside to profile the resulting clusters.
# Dropping columns Place name and Place type
kmd = df.drop(['Place name','Place type'], axis=1)
# Converting Price to numeric form:
# by convention the number of $ signs reflects cost, so the higher the count, the higher the cost
kmd['Price numeric'] = kmd['Price'].str.len()
# Thus our features are:
features = kmd[['Rating','Reviews','Price numeric','Delivery option','Dine in option','Takeout option']]
Data transformation: imputing the missing values and scaling the data.
# Initializing the imputer (mean imputation by default)
imp = SimpleImputer()
# Initializing the scaler
stdscaler = StandardScaler()
# Building a pipeline of the imputer and scaler
pipeline = make_pipeline(imp, stdscaler)
pipe_data = pipeline.fit_transform(features)
To determine the number of clusters for the KMeans model, I will use a scree (elbow) plot of the model's inertia, increasing the cluster count one at a time.
inertia=[]
for i in range(1,10):
kmm = KMeans(n_clusters=i,random_state=42)
kmm.fit(pipe_data)
inertia.append(kmm.inertia_)
plt.plot(range(1,10),inertia,marker='x')
plt.title('Scree Plot')
plt.xlabel('Clusters')
plt.ylabel('Inertia')
plt.show()
The bend of the elbow happens between 5 and 7 clusters, so the model will use 6 clusters.
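As a sanity check on the elbow read, one could also compare silhouette scores across candidate cluster counts (higher is better); a minimal sketch:
from sklearn.metrics import silhouette_score
# Silhouette score for each candidate number of clusters
for k in range(2, 10):
    labels = KMeans(n_clusters=k, random_state=42).fit_predict(pipe_data)
    print(f'k={k}: silhouette = {silhouette_score(pipe_data, labels):.3f}')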
kmm6 = KMeans(n_clusters=6,random_state=42)
kmm6.fit(pipe_data)
KMeans(n_clusters=6, random_state=42)
To visualize the model we will use t-SNE, which projects high-dimensional data onto two dimensions.
# Initializing the t-SNE model with a learning rate of 100
tsne = TSNE(learning_rate=100)
tsne_feature=tsne.fit_transform(pipe_data)
x_tsne = tsne_feature[:,0]
y_tsne = tsne_feature[:,1]
plt.scatter(x_tsne,y_tsne,c=kmm6.labels_)
plt.title('Cluster in 2D plane')
plt.show()
Finally, adding the model labels to the original dataset:
df['Market segment']=kmm6.labels_
Market_Feature = df.groupby(['Market segment','Region','Price','exp'])[['Rating','Reviews']].mean().reset_index()
display(Market_Feature)
| | Market segment | Region | Price | exp | Rating | Reviews |
|---|---|---|---|---|---|---|
0 | 0 | Kharkiv | $$ | 2 | 4.800000 | 261.000000 |
1 | 0 | Kiev | $$ | 2 | 4.800000 | 824.000000 |
2 | 0 | Lviv | $$ | 2 | 4.866667 | 1117.000000 |
3 | 0 | Odessa | $$ | 2 | 4.800000 | 150.000000 |
4 | 0 | Poltava | $$ | 2 | 4.800000 | 124.000000 |
5 | 0 | Zaporozhye | $$ | 2 | 4.800000 | 807.000000 |
6 | 1 | Dnipro | $$$ | 3 | 4.700000 | 2381.000000 |
7 | 1 | Khrivoy Rog | $$$ | 3 | 4.600000 | 327.000000 |
8 | 2 | Lviv | $$ | 3 | 4.700000 | 17937.000000 |
9 | 3 | Dnipro | $$ | 3 | 4.550000 | 1290.000000 |
10 | 3 | Kharkiv | $$ | 3 | 4.600000 | 118.000000 |
11 | 3 | Kherson | $$ | 3 | 4.600000 | 592.500000 |
12 | 3 | Khrivoy Rog | $$ | 3 | 4.250000 | 1218.500000 |
13 | 3 | Kiev | $$ | 3 | 4.600000 | 1908.000000 |
14 | 3 | Lviv | $$ | 3 | 4.600000 | 1570.000000 |
15 | 3 | Mariupol | $$ | 3 | 4.700000 | 174.250000 |
16 | 3 | Odessa | $$ | 3 | 4.600000 | 1744.500000 |
17 | 3 | Poltava | $$ | 3 | 4.500000 | 1195.000000 |
18 | 4 | Kiev | $ | 2 | 4.600000 | 1300.000000 |
19 | 5 | Dnipro | $$ | 2 | 4.550000 | 707.500000 |
20 | 5 | Kharkiv | $$ | 2 | 4.625000 | 729.000000 |
21 | 5 | Kherson | $$ | 2 | 4.542857 | 482.571429 |
22 | 5 | Khrivoy Rog | $$ | 2 | 4.580000 | 749.000000 |
23 | 5 | Kiev | $$ | 2 | 4.650000 | 1201.750000 |
24 | 5 | Lviv | $$ | 2 | 4.577778 | 1199.000000 |
25 | 5 | Mariupol | $$ | 2 | 4.533333 | 299.333333 |
26 | 5 | Odessa | $$ | 2 | 4.700000 | 268.500000 |
27 | 5 | Poltava | $$ | 2 | 4.400000 | 804.000000 |
28 | 5 | Zaporozhye | $$ | 2 | 4.520000 | 493.000000 |
sns.set_style('whitegrid')
sns.catplot(x='Rating',y='Reviews',col='Market segment',col_wrap=2,data=Market_Feature)
plt.title('Market Segmentation',y=3.3)
plt.yscale('log')
plt.xscale('log')
plt.ylabel('Review -> Footprint')
plt.xlabel('Ratings')
plt.show()
The KMeans model was built with 6 clusters. The distinguishing features of each cluster are:
Cluster 0: Small to medium-sized stores (by customer footprint) with medium prices, high ratings, and two of the three service modes (takeout, dine in, or delivery).
Cluster 1: Expensive, high-end stores offering all three service modes, in the Dnipro and Khrivoy Rog regions.
Cluster 2: The biggest store in Ukraine, Lviv Coffee Manufacture; although it is a single store, its dominance in footprint makes it a segment of its own.
Cluster 3: Small to large stores with medium prices, offering all three service modes.
Cluster 4: Low-cost stores in the Kiev region.
Cluster 5: Stores spread across all regions of Ukraine, with medium prices, two service modes, and average ratings.
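For a sense of scale, the number of shops in each segment can be checked directly:
# Counting the shops assigned to each market segment
print(df['Market segment'].value_counts().sort_index())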
In general, there is definitely a market for opening a branch in Ukraine. There is healthy competition among the existing shops, most of which are very highly rated. Prices are largely uniform, with very few low-cost or high-cost shops. The Lviv region looks like the coffee capital of Ukraine, home to several of the most-reviewed shops, and a special mention goes to Lviv Coffee Manufacture, a giant among its peers. However, there is still an opportunity to improve this analysis by merging socio-economic data with the current dataset. The need for this is clear from Lviv, which is only the 7th-largest region by population yet dominates the coffee scene, whereas Odessa has a larger population but lags behind all the other regions when it comes to coffee. Socio-economic data could shed additional light through per capita income and demographic information such as age and gender.